# Load the ISLR2 data sets and attach the Wage data frame so its columns
# (wage, age, year, education) can be referenced directly by name.
# NOTE(review): attach() is generally discouraged (it masks names on the
# search path), but this whole script depends on it, so it is kept.
library(ISLR2)
attach(Wage)

Basis Function

Most non-linear regression methods can be viewed as fitting a model built from basis functions: \(y_i=\beta_0+\beta_1f_1(x_i)+\dots+\beta_nf_n(x_i)\)

Example 1: Polynomial Regression

\(y_i=\beta_0+\beta_1x_i+\dots+\beta_d(x_i)^d\)

# Degree-4 polynomial regression of wage on age, using the orthogonal
# polynomial basis (poly() default raw = FALSE).
fit1<-lm(wage~poly(age,4))
# Coefficient table: estimates, standard errors, t statistics, p-values.
coef(summary(fit1))
##                 Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287409 153.283015 0.000000e+00
## poly(age, 4)1  447.06785 39.9147851  11.200558 1.484604e-28
## poly(age, 4)2 -478.31581 39.9147851 -11.983424 2.355831e-32
## poly(age, 4)3  125.52169 39.9147851   3.144742 1.678622e-03
## poly(age, 4)4  -77.91118 39.9147851  -1.951938 5.103865e-02

Notice that in poly(), the default is raw=FALSE, which means R returns an orthogonal polynomial basis: each column of the basis matrix is a linear combination of \(age,age^2,\dots\), constructed so that the columns are mutually orthogonal.

# Same degree-4 model on the raw basis (age, age^2, age^3, age^4);
# coefficients differ from fit1 but the fitted values are identical.
fit2<-lm(wage~poly(age,4,raw=TRUE))
coef(summary(fit2))
##                                Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept)               -1.841542e+02 6.004038e+01 -3.067172 0.0021802539
## poly(age, 4, raw = TRUE)1  2.124552e+01 5.886748e+00  3.609042 0.0003123618
## poly(age, 4, raw = TRUE)2 -5.638593e-01 2.061083e-01 -2.735743 0.0062606446
## poly(age, 4, raw = TRUE)3  6.810688e-03 3.065931e-03  2.221409 0.0263977518
## poly(age, 4, raw = TRUE)4 -3.203830e-05 1.641359e-05 -1.951938 0.0510386498

Notice that although the estimated coefficients of fit1 and fit2 are very different, the fitted values of the two models are essentially identical.

#fitted values
agelim<-range(age) # create a 2-dimensional vector of (age.min,age.max)
# Integer grid from the youngest to the oldest observed age; reused as the
# prediction grid for every model in this script.
age.grid<-seq(from=agelim[1],to=agelim[2])
age.grid
##  [1] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
## [26] 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
## [51] 68 69 70 71 72 73 74 75 76 77 78 79 80
# A named list whose component "age" matches the model term; this is the
# newdata format predict() expects below.
list(age=age.grid)
## $age
##  [1] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
## [26] 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
## [51] 68 69 70 71 72 73 74 75 76 77 78 79 80
#fit for raw=F (orthogonal basis)
preds<-predict(fit1,newdata=list(age=age.grid),se=TRUE) #se=TRUE means we want standard errors as well
#fit for raw=T (raw basis)
preds2<-predict(fit2,newdata=list(age=age.grid),se=TRUE)
# The two bases give numerically identical fitted values (difference ~1e-10).
max(abs(preds$fit-preds2$fit))
## [1] 9.353585e-11
# Approximate 95% pointwise confidence band: fit +/- 2 standard errors.
se.bands<-cbind(preds$fit-2*preds$se.fit,preds$fit+2*preds$se.fit)
# NOTE(review): se.bands[1:10] indexes the matrix as a plain vector
# (column-major), so it prints the first 10 lower-band values, not rows.
se.bands[1:10]
##  [1] 41.33491 49.75522 57.38768 64.27111 70.44323 75.94470 80.82540 85.14914
##  [9] 88.99064 92.42467

Recall that the standard error of a fitted value under simple linear regression is: \(se_{\hat{y}_j}=\hat{\sigma}\sqrt{\frac{1}{n}+\frac{(x_j-\bar{x})^2}{\sum_i(x_i-\bar{x})^2}}\)

#Two equivalent ways of performing polynomial regression
# I() shields ^ from formula syntax so age^2 etc. are arithmetic powers.
fit2a<-lm(wage~age+I(age^2)+I(age^3)+I(age^4))
coef(summary(fit2a))
##                  Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept) -1.841542e+02 6.004038e+01 -3.067172 0.0021802539
## age          2.124552e+01 5.886748e+00  3.609042 0.0003123618
## I(age^2)    -5.638593e-01 2.061083e-01 -2.735743 0.0062606446
## I(age^3)     6.810688e-03 3.065931e-03  2.221409 0.0263977518
## I(age^4)    -3.203830e-05 1.641359e-05 -1.951938 0.0510386498
# cbind() builds the raw polynomial design matrix inside the formula;
# identical fit to fit2a, only the coefficient labels differ.
fit2b<-lm(wage~cbind(age,age^2,age^3,age^4))
coef(summary(fit2b))
##                                         Estimate   Std. Error   t value
## (Intercept)                        -1.841542e+02 6.004038e+01 -3.067172
## cbind(age, age^2, age^3, age^4)age  2.124552e+01 5.886748e+00  3.609042
## cbind(age, age^2, age^3, age^4)    -5.638593e-01 2.061083e-01 -2.735743
## cbind(age, age^2, age^3, age^4)     6.810688e-03 3.065931e-03  2.221409
## cbind(age, age^2, age^3, age^4)    -3.203830e-05 1.641359e-05 -1.951938
##                                        Pr(>|t|)
## (Intercept)                        0.0021802539
## cbind(age, age^2, age^3, age^4)age 0.0003123618
## cbind(age, age^2, age^3, age^4)    0.0062606446
## cbind(age, age^2, age^3, age^4)    0.0263977518
## cbind(age, age^2, age^3, age^4)    0.0510386498
#plot the results
#mar and oma control the inner and outer margins of the graph
par(mfrow=c(1,2),mar=c(4.5,4.5,1,1),oma=c(0,0,4,0))
#cex scales the size of the data points relative to the whole graph
plot(age,wage,xlim=agelim,cex=0.5,col="darkgrey")
#title() with outer=T writes the title in the outer margin, spanning both panels
title("Degree-4 polynomial", outer=T)
# Fitted degree-4 curve over the prediction grid.
lines(age.grid,preds$fit,lwd=2,col="blue")
#matlines() plots each column of a matrix; se.bands has two columns
#(lower and upper band). lty=3 gives dotted lines.
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)

We now perform the logistic polynomial regression. The model is simply the combination of logistic regression and polynomial regression. Here \(\log(\frac{P(Y=1|X)}{1-P(Y=1|X)})=X\beta\), or equivalently,\(P(Y=1|X)=\frac{\exp{X\beta}}{1+\exp{X\beta}}\) where \(X=(1,x,x^2,\dots)\).

# Logistic polynomial regression: model P(wage > 250 | age) with a degree-4
# polynomial. I(wage>250) builds the binary response on the fly.
fit<-glm(I(wage>250)~poly(age,4),family=binomial)
# Default glm predict type is "link": preds$fit is on the log-odds scale.
preds<-predict(fit,newdata=list(age=age.grid),se=TRUE)
preds$fit
##          1          2          3          4          5          6          7 
## -18.438190 -16.395452 -14.560646 -12.919746 -11.459196 -10.165904  -9.027249 
##          8          9         10         11         12         13         14 
##  -8.031077  -7.165700  -6.419899  -5.782925  -5.244494  -4.794789  -4.424465 
##         15         16         17         18         19         20         21 
##  -4.124640  -3.886904  -3.703311  -3.566386  -3.469120  -3.404973  -3.367870 
##         22         23         24         25         26         27         28 
##  -3.352208  -3.352849  -3.365123  -3.384829  -3.408233  -3.432069  -3.453538 
##         29         30         31         32         33         34         35 
##  -3.470311  -3.480523  -3.482781  -3.476158  -3.460194  -3.434898  -3.400746 
##         36         37         38         39         40         41         42 
##  -3.358682  -3.310119  -3.256936  -3.201481  -3.146569  -3.095483  -3.051975 
##         43         44         45         46         47         48         49 
##  -3.020263  -3.005034  -3.011442  -3.045110  -3.112128  -3.219053  -3.372911 
##         50         51         52         53         54         55         56 
##  -3.581196  -3.851869  -4.193359  -4.614563  -5.124846  -5.734040  -6.452445 
##         57         58         59         60         61         62         63 
##  -7.290830  -8.260431  -9.372951 -10.640562 -12.075903 -13.692081 -15.502672
# Inverse-logit transform of the linear predictor: probabilities P(Y=1|X).
pfit<-exp(preds$fit)/(1+exp(preds$fit)) #value of the P(Y|X)
pfit
##            1            2            3            4            5            6 
## 9.826427e-09 7.577844e-08 4.746699e-07 2.449201e-06 1.055188e-05 3.845805e-05 
##            7            8            9           10           11           12 
## 1.200780e-04 3.250922e-04 7.720418e-04 1.626171e-03 3.070238e-03 5.248798e-03 
##           13           14           15           16           17           18 
## 8.204864e-03 1.183878e-02 1.591202e-02 2.009659e-02 2.404918e-02 2.748122e-02 
##           19           20           21           22           23           24 
## 3.020374e-02 3.214042e-02 3.331483e-02 3.382293e-02 3.380199e-02 3.340341e-02 
##           25           26           27           28           29           30 
## 3.277296e-02 3.203914e-02 3.130812e-02 3.066352e-02 3.016889e-02 2.987151e-02 
##           31           32           33           34           35           36 
## 2.980614e-02 2.999827e-02 3.046630e-02 3.122244e-02 3.227216e-02 3.361200e-02 
##           37           38           39           40           41           42 
## 3.522567e-02 3.707845e-02 3.911004e-02 4.122669e-02 4.329396e-02 4.513229e-02 
##           43           44           45           46           47           48 
## 4.651881e-02 4.719897e-02 4.691162e-02 4.542905e-02 4.260976e-02 3.845499e-02 
##           49           50           51           52           53           54 
## 3.315287e-02 2.708817e-02 2.079824e-02 1.487100e-02 9.809332e-03 5.911974e-03 
##           55           56           57           58           59           60 
## 3.223561e-03 1.574180e-03 6.812972e-04 2.584807e-04 8.498499e-05 2.392502e-05 
##           61           62           63 
## 5.695077e-06 1.131369e-06 1.850440e-07
# Build the +/- 2*SE band on the logit scale first, then map it through the
# inverse logit; this keeps the band inside [0,1], which computing
# fit +/- 2*SE directly on the probability scale would not guarantee.
se.bands.logit<-cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
se.bands<-exp(se.bands.logit)/(1+exp(se.bands.logit))
se.bands
##           [,1]         [,2]
## 1  0.002112586 4.560982e-14
## 2  0.002769461 2.067720e-12
## 3  0.003572811 6.283757e-11
## 4  0.004538493 1.315721e-09
## 5  0.005680373 1.949028e-08
## 6  0.007009933 2.095264e-07
## 7  0.008536386 1.675070e-06
## 8  0.010267586 1.019390e-05
## 9  0.012212121 4.828409e-05
## 10 0.014383117 1.817701e-04
## 11 0.016804336 5.546163e-04
## 12 0.019518713 1.396599e-03
## 13 0.022596543 2.951528e-03
## 14 0.026130891 5.320935e-03
## 15 0.030189010 8.328920e-03
## 16 0.034692408 1.156794e-02
## 17 0.039298107 1.462723e-02
## 18 0.043454815 1.727334e-02
## 19 0.046641487 1.944097e-02
## 20 0.048578497 2.114109e-02
## 21 0.049281445 2.239933e-02
## 22 0.048990856 2.323636e-02
## 23 0.048056386 2.367057e-02
## 24 0.046829563 2.373073e-02
## 25 0.045589671 2.347083e-02
## 26 0.044511263 2.297768e-02
## 27 0.043671699 2.236286e-02
## 28 0.043084456 2.174208e-02
## 29 0.042736472 2.121518e-02
## 30 0.042614748 2.085592e-02
## 31 0.042720147 2.071149e-02
## 32 0.043073081 2.080602e-02
## 33 0.043715608 2.114380e-02
## 34 0.044711869 2.171024e-02
## 35 0.046145914 2.247128e-02
## 36 0.048113530 2.337394e-02
## 37 0.050703852 2.435125e-02
## 38 0.053969621 2.533224e-02
## 39 0.057891916 2.625177e-02
## 40 0.062351211 2.705265e-02
## 41 0.067117518 2.767577e-02
## 42 0.071871637 2.804058e-02
## 43 0.076269905 2.802088e-02
## 44 0.080058498 2.742438e-02
## 45 0.083216969 2.599620e-02
## 46 0.086062374 2.348724e-02
## 47 0.089217894 1.982022e-02
## 48 0.093430474 1.528236e-02
## 49 0.099379865 1.054306e-02
## 50 0.107629641 6.386212e-03
## 51 0.118705060 3.338172e-03
## 52 0.133197764 1.480723e-03
## 53 0.151842683 5.478804e-04
## 54 0.175559978 1.660645e-04
## 55 0.205460597 4.044320e-05
## 56 0.242799002 7.752435e-06
## 57 0.288838187 1.144402e-06
## 58 0.344581767 1.271472e-07
## 59 0.410345281 1.038022e-08
## 60 0.485213976 6.073219e-10
## 61 0.566579258 2.481151e-11
## 62 0.650090286 6.889563e-13
## 63 0.730315921 1.264430e-14
#IMPORTANT: We could have directly computed the predicted probabilities using predict(type="response"), which equals pfit
# With type="response" the reported se.fit is on the probability scale too.
preds<-predict(fit,newdata=list(age=age.grid),type="response",se=T)
preds
## $fit
##            1            2            3            4            5            6 
## 9.826427e-09 7.577844e-08 4.746699e-07 2.449201e-06 1.055188e-05 3.845805e-05 
##            7            8            9           10           11           12 
## 1.200780e-04 3.250922e-04 7.720418e-04 1.626171e-03 3.070238e-03 5.248798e-03 
##           13           14           15           16           17           18 
## 8.204864e-03 1.183878e-02 1.591202e-02 2.009659e-02 2.404918e-02 2.748122e-02 
##           19           20           21           22           23           24 
## 3.020374e-02 3.214042e-02 3.331483e-02 3.382293e-02 3.380199e-02 3.340341e-02 
##           25           26           27           28           29           30 
## 3.277296e-02 3.203914e-02 3.130812e-02 3.066352e-02 3.016889e-02 2.987151e-02 
##           31           32           33           34           35           36 
## 2.980614e-02 2.999827e-02 3.046630e-02 3.122244e-02 3.227216e-02 3.361200e-02 
##           37           38           39           40           41           42 
## 3.522567e-02 3.707845e-02 3.911004e-02 4.122669e-02 4.329396e-02 4.513229e-02 
##           43           44           45           46           47           48 
## 4.651881e-02 4.719897e-02 4.691162e-02 4.542905e-02 4.260976e-02 3.845499e-02 
##           49           50           51           52           53           54 
## 3.315287e-02 2.708817e-02 2.079824e-02 1.487100e-02 9.809332e-03 5.911974e-03 
##           55           56           57           58           59           60 
## 3.223561e-03 1.574180e-03 6.812972e-04 2.584807e-04 8.498499e-05 2.392502e-05 
##           61           62           63 
## 5.695077e-06 1.131369e-06 1.850440e-07 
## 
## $se.fit
##            1            2            3            4            5            6 
## 6.033653e-08 3.981824e-07 2.119358e-06 9.220159e-06 3.320723e-05 1.002277e-04 
##            7            8            9           10           11           12 
## 2.564800e-04 5.626547e-04 1.069479e-03 1.779943e-03 2.622739e-03 3.466448e-03 
##           13           14           15           16           17           18 
## 4.181423e-03 4.716338e-03 5.128401e-03 5.523590e-03 5.947782e-03 6.344487e-03 
##           19           20           21           22           23           24 
## 6.614304e-03 6.691149e-03 6.573028e-03 6.312234e-03 5.988403e-03 5.680108e-03 
##           25           26           27           28           29           30 
## 5.442968e-03 5.299325e-03 5.241697e-03 5.245939e-03 5.285364e-03 5.339675e-03 
##           31           32           33           34           35           36 
## 5.398309e-03 5.460737e-03 5.536090e-03 5.643051e-03 5.809586e-03 6.070929e-03 
##           37           38           39           40           41           42 
## 6.463916e-03 7.017241e-03 7.740416e-03 8.616431e-03 9.602075e-03 1.063780e-02 
##           43           44           45           46           47           48 
## 1.166801e-02 1.267023e-02 1.368205e-02 1.479666e-02 1.609138e-02 1.750089e-02 
##           49           50           51           52           53           54 
## 1.873185e-02 1.931792e-02 1.880891e-02 1.699664e-02 1.405653e-02 1.051435e-02 
##           55           56           57           58           59           60 
## 7.039302e-03 4.176838e-03 2.175193e-03 9.842325e-04 3.828424e-04 1.265771e-04 
##           61           62           63 
## 3.514927e-05 8.095793e-06 1.526511e-06 
## 
## $residual.scale
## [1] 1
# Plot the fitted logistic polynomial on the probability scale.
# type="n" draws an empty frame; the 0/1 indicator is divided by 5 so the
# rug of points at 0 and 0.2 fits inside ylim=c(0,0.2).
plot(age,I(wage>250),xlim=agelim,type="n",ylim=c(0,0.2))
# jitter() spreads tied age values slightly so the rug marks don't overplot.
# BUG FIX: pch="|" draws the conventional vertical rug tick; the original
# pch="l" plotted a lowercase letter L instead.
points(jitter(age),I((wage>250)/5),cex=0.5,pch="|",col="darkgrey")
# Fitted probability curve and its transformed +/- 2*SE band.
lines(age.grid,pfit,lwd=2,col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)

Example 2: Step Function

Let \(C_0(X)=I(X<c_1),\ \dots,\ C_i(X)=I(c_i\leq X<c_{i+1}),\ \dots,\ C_K(X)=I(c_K\leq X)\). The step-function model is then \(y_i=\beta_0+\beta_1C_1(x_i)+\dots+\beta_KC_K(x_i)+\epsilon_i\), where \(C_0\) is the baseline level absorbed into the intercept.

#We use the cut( ) function to split the age range into pieces
# cut(age, 4) makes 4 equal-width intervals and returns a factor recording
# each observation's interval.
cut(age,4)[1:10] #viewing the first 10 values shows which interval each point falls into
##  [1] (17.9,33.5] (17.9,33.5] (33.5,49]   (33.5,49]   (49,64.5]   (49,64.5]  
##  [7] (33.5,49]   (17.9,33.5] (33.5,49]   (49,64.5]  
## Levels: (17.9,33.5] (33.5,49] (49,64.5] (64.5,80.1]
table(cut(age,4)) #table() lists the four intervals and how many observations fall in each
## 
## (17.9,33.5]   (33.5,49]   (49,64.5] (64.5,80.1] 
##         750        1399         779          72
# Step-function regression: the cut() factor enters via dummy variables, so
# each interval gets its own constant level; the lowest interval
# (17.9,33.5] is the baseline absorbed into the intercept.
fit<-lm(wage~cut(age,4))
coef(summary(fit))
##                         Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)            94.158392   1.476069 63.789970 0.000000e+00
## cut(age, 4)(33.5,49]   24.053491   1.829431 13.148074 1.982315e-38
## cut(age, 4)(49,64.5]   23.664559   2.067958 11.443444 1.040750e-29
## cut(age, 4)(64.5,80.1]  7.640592   4.987424  1.531972 1.256350e-01
# Predict the step function on the age grid and overlay it on the scatter.
preds<-predict(fit,newdata=list(age=age.grid))
par(mfrow=c(1,2),mar=c(4.5,4.5,1,1),oma=c(0,0,4,0))
plot(age,wage,xlim=agelim,cex=0.5,col="darkgrey")
title("Step Function", outer=T)
lines(age.grid,preds,lwd=2,col="blue")

Here is a detailed explanation of vector, list and table.

1.vector: “vector” is in fact short for “atomic vector.” It is a sequence whose elements all share one type: all numerical values, all characters, all logicals, etc.

2.list: a list can be viewed as a flexible (generic) vector. Its elements may be numbers, characters, or even whole vectors and other lists of mixed types. Any atomic vector can be stored inside a list, but a list with mixed element types cannot become an atomic vector without coercion.

3.table: table is a tabulation helping us to view the level and the frequency of each level appeared in a categorical variable.

Regression Spline

Piecewise Polynomial Regression

The motivation behind regression spline is the combination of polynomial regression and step function, which is called piecewise polynomial regression, very basic but providing great degree of flexibility.

A simple example of piecewise polynomial regression with two pieces is:

\(y_i= \beta_{01}+\beta_{11}x_i+\dots+\beta_{31}(x_i)^3\) if \(x_i<c\), \(y_i= \beta_{02}+\beta_{12}x_i+\dots+\beta_{32}(x_i)^3\) if \(x_i \geq c\)

However, one obvious limit with piecewise polynomial regression is that it’s discontinuous at the joint point, and when we add the continuity requirement, it becomes the famous “spline.”

Spline

The simplest example of a spline is the linear spline. Basically, it’s a continuous piecewise polynomial function. The basis representation of this function is:

\(y_i=\beta_0+\beta_1b_1(x_i)+\beta_2b_2(x_i)+\dots+\beta_{K+1}b_{K+1}(x_i)\) with \(b_1(x)=x\) and \(b_{k+1}(x)=(x-\epsilon_k)_{+}\) (the positive-part function), where \(\epsilon_k\) is the kth knot.

Another important example is the cubic spline with the basis representation: \(y_i=\beta_0+\beta_1b_1(x_i)+\beta_2b_2(x_i)+\dots+\beta_{K+3}b_{K+3}(x_i)\) with \(b_1(x)=x,b_2(x)=x^2,b_3(x)=x^3,b_{k+3}(x)=(x-\epsilon_k)^3_{+}\).

#We use bs( ), from the splines library, to generate the matrix of spline basis functions
library(splines)
# Cubic spline (bs() default degree = 3) with interior knots at 25, 40, 60.
fit<-lm(wage~bs(age,knots=c(25,40,60)))
pred<-predict(fit,newdata=list(age=age.grid),se=T)
# NOTE(review): pred$se works only via partial matching of the component
# name se.fit; pred$se.fit is the explicit spelling.
se.bands<-cbind(pred$fit-2*pred$se,pred$fit+2*pred$se)
plot(age,wage,col="darkgrey")
lines(age.grid,pred$fit,lwd=2,col="red")
matlines(age.grid,se.bands,lwd=1,col="red",lty=3)

#Alternatively, the band could be drawn with two separate lines() calls:
#lines(age.grid,pred$fit+2*pred$se,lty="dashed")
#lines(age.grid,pred$fit-2*pred$se,lty="dashed")

Notice that although we only require the linear spline to be continuous, the cubic spline is in fact \(\mathcal{C}^2\). We won’t go into detail, but the number of degrees of freedom for a cubic spline with K knots is K+4.

#A verification of the degrees of freedom of a cubic spline (K+4)
dim(bs(age,knots=c(25,40,60))) #1,x,x^2,x^3, plus one (x-ek)^3 term per knot, so 4+3=7 basis functions in total (df=7). The "6" below excludes the intercept column: 4+3-1=6
## [1] 3000    6
# Supplying df instead of knots lets bs() place the knots itself...
dim(bs(age,df=7))
## [1] 3000    7
# ...at equally spaced quantiles of age:
attr(bs(age,df=7),"knots")
## 20% 40% 60% 80% 
##  32  39  46  53
#CAREFUL: df=7 means there are 7 basis functions excluding the intercept -> K+4-1=7, so there are 4 knots

Sometimes we put an extra constraint on the cubic spline, which results in the “natural cubic spline.” The constraint is that on the leftmost and rightmost pieces the function must be linear (second derivative = 0). A natural cubic spline has narrower confidence intervals at the two ends compared with an ordinary cubic spline. The degrees of freedom for a natural cubic spline are simply K.

#Similarly, we use ns( ) to perform natural cubic spline regression
# With df=4, ns() places 3 interior knots at the quartiles of age.
attr(ns(age,df=4),"knots")
##   25%   50%   75% 
## 33.75 42.00 51.00
# Natural cubic spline fit: linearity is enforced beyond the boundary knots.
fit2<-lm(wage~ns(age,df=4),data=Wage) #just remember, when using ns(), deg of freedom = number of intervals
summary(fit2)
## 
## Call:
## lm(formula = wage ~ ns(age, df = 4), data = Wage)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -98.737 -24.477  -5.083  15.371 204.874 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        58.556      4.235  13.827   <2e-16 ***
## ns(age, df = 4)1   60.462      4.190  14.430   <2e-16 ***
## ns(age, df = 4)2   41.963      4.372   9.597   <2e-16 ***
## ns(age, df = 4)3   97.020     10.386   9.341   <2e-16 ***
## ns(age, df = 4)4    9.773      8.657   1.129    0.259    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.92 on 2995 degrees of freedom
## Multiple R-squared:  0.08598,    Adjusted R-squared:  0.08476 
## F-statistic: 70.43 on 4 and 2995 DF,  p-value: < 2.2e-16
pred2<-predict(fit2,newdata=list(age=age.grid),se=T)
# NOTE(review): pred2$se relies on partial matching of se.fit.
se.bands2<-cbind(pred2$fit-2*pred2$se,pred2$fit+2*pred2$se)
plot(age,wage,col="darkgrey")
# Overlay both fits: red = cubic spline (pred, from the bs() fit above),
# blue = natural cubic spline.
lines(age.grid,pred$fit,lwd=2,col="red")
lines(age.grid,pred2$fit,col="blue",lwd=2)
matlines(age.grid,se.bands,lwd=1,col="red",lty=3)
matlines(age.grid,se.bands2,lwd=1,col="blue",lty=3)

# red is cubic spline; blue is natural cubic spline

Smoothing spline is another way to improve on piecewise polynomial regression. Different from the normal least-squares setting, we try to minimize: \(\sum_{i=1}^n(y_i-g(x_i))^2+\lambda \int g''(t)^2\,dt\). The penalty term can be viewed as the total wiggliness of the graph. By choosing the proper tuning parameter \(\lambda\), we can balance the fit against the smoothness of the graph. In fact, it can be proved that the smoothing spline is a natural cubic spline with knots at every observation point \(x_i\).

It sounds like the smoothing spline has a huge degree of freedom. Here we introduce the concept of effective degrees of freedom. Let \(\hat{g}_{\lambda}=(\hat{g}_{\lambda}(x_1),\dots, \hat{g}_{\lambda}(x_n))\) be the solution for a particular choice of \(\lambda\), i.e., the vector containing the fitted value at each observation. In fact, there exists a matrix \(S_{\lambda}\) so that \(\hat{g}_{\lambda}=S_{\lambda}y\), and the effective degrees of freedom associated with \(\lambda\) are defined to be the trace of \(S_{\lambda}\), i.e., \(df_{\lambda}=\sum_{i=1}^n (S_{\lambda})_{ii}\). The effective degrees of freedom can be viewed as the number of parameters that have a real effect on the fit of the smoothing spline. Thus, different from the previous setting — where the fit depended on choosing the position and number of the knots — here the problem is to choose the tuning parameter \(\lambda\) properly, and in fact, if we use leave-one-out cross-validation to choose \(\lambda\), we are actually solving:

\(\min_{\lambda} RSS_{cv} (\lambda)=\min_{\lambda} \sum_{i=1}^n (y_i-\hat{g}_{\lambda}^{-i}(x_i))^2=\min_{\lambda} \sum_{i=1}^n[\frac{y_i-\hat{g}_{\lambda}(x_i)}{1-(S_{\lambda})_{ii}}]^2\)

Here \(\hat{g}_{\lambda}^{-i}\) indicates the fitted value at \(x_i\) of the smooth spline trained by all the observations except the point \(x_i\), while \(g_{\lambda}\) here is the smooth spline trained by all the observation. (We can compare this formula to the one we mentioned in Resampling Methods: \(CV_{(n)}=\frac{1}{n}\sum_{i=1}^n(\frac{y_i-\hat{y}_i}{1-h_i})^2\))

#We use smooth.spline( ) to perform smoothing-spline regression
plot(age,wage,xlim=agelim,cex=0.5,col="darkgrey")
title("Smoothing Spline")
# Fix the effective degrees of freedom at 16; smooth.spline solves for the
# lambda that yields df = 16.
fit<-smooth.spline(age,wage,df=16)
# Choose lambda by cross-validation instead (hence the warning about
# non-unique x values).
fit2<-smooth.spline(age,wage,cv=TRUE)
## Warning in smooth.spline(age, wage, cv = TRUE): cross-validation with non-unique
## 'x' values seems doubtful
# Effective degrees of freedom selected by CV.
fit2$df
## [1] 6.794596
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
# legend() adds a key identifying the two curves.
legend("topright",legend=c("16 DF","6.8 DF"), col=c("red","blue"),lty=1,lwd=2,cex=0.8)

Local Regression

Local regression is a different approach for fitting flexible nonlinear functions, which involves computing the fit at a target point \(x_0\) only using nearby observations. We use a graph to illustrate the idea:

Alt text Here the solid orange point is the observation we try to fit with a local (simple) regression, and the hollow orange points are the points used to train that regression — they are its nearest neighbours. The bell-shaped yellow shade indicates the weight placed on each point.

plot(age,wage,xlim=agelim,cex=0.5,col="darkgrey")
title("Local Regression")
# loess: local regression. span=0.2 means each local fit uses the nearest
# 20% of the observations.
fit1<-loess(wage~age,span=0.2)
# NOTE(review): "fit" here is still the smooth.spline object from the
# previous section (the output below shows a smooth.spline.fit component);
# presumably summary(fit1) was intended -- it appears right after. Confirm.
summary(fit)
##            Length Class             Mode   
## x          61     -none-            numeric
## y          61     -none-            numeric
## w          61     -none-            numeric
## yin        61     -none-            numeric
## tol         1     -none-            numeric
## data        3     -none-            list   
## no.weights  1     -none-            logical
## n           1     -none-            numeric
## lev        61     -none-            numeric
## cv          1     -none-            logical
## cv.crit     1     -none-            numeric
## pen.crit    1     -none-            numeric
## crit        1     -none-            numeric
## df          1     -none-            numeric
## spar        1     -none-            numeric
## ratio       1     -none-            numeric
## lambda      1     -none-            numeric
## iparms      5     -none-            numeric
## auxM        0     -none-            NULL   
## fit         5     smooth.spline.fit list   
## call        4     -none-            call
# Summary of the span=0.2 loess fit; "Equivalent Number of Parameters"
# plays the role of effective degrees of freedom.
summary(fit1)
## Call:
## loess(formula = wage ~ age, span = 0.2)
## 
## Number of Observations: 3000 
## Equivalent Number of Parameters: 16.42 
## Residual Standard Error: 39.92 
## Trace of smoother matrix: 18.15  (exact)
## 
## Control settings:
##   span     :  0.2 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
##   normalize:  TRUE
##  parametric:  FALSE
## drop.square:  FALSE
# A wider span (50% of the data per local fit) gives a smoother curve.
fit2<-loess(wage~age,span=0.5)
p<-predict(fit1,newdata=data.frame(age=age.grid),se=TRUE)
p
## $fit
##         1         2         3         4         5         6         7         8 
##  58.18419  62.22773  66.31587  70.46182  74.67881  79.01912  83.34946  87.91885 
##         9        10        11        12        13        14        15        16 
##  92.39689  96.23599  99.39110 101.68931 102.96082 103.90136 104.52324 106.50078 
##        17        18        19        20        21        22        23        24 
## 110.55537 112.02262 114.43434 119.23533 121.23858 121.60981 115.37416 115.91396 
##        25        26        27        28        29        30        31        32 
## 119.55518 121.17026 120.21935 120.66119 122.92292 118.56722 116.99702 115.89697 
##        33        34        35        36        37        38        39        40 
## 115.98670 114.96864 115.88492 117.69579 118.61631 118.64199 118.12423 118.58307 
##        41        42        43        44        45        46        47        48 
## 119.22974 120.24453 120.84083 120.41871 119.56593 118.24917 116.63979 114.93753 
##        49        50        51        52        53        54        55        56 
## 113.15165 111.29138 109.36598 107.38468 105.35674 103.29140 101.19791  99.08550 
##        57        58        59        60        61        62        63 
##  96.96344  94.84095  92.72730  90.63172  88.56346  86.53176  84.54588 
## 
## $se.fit
##         1         2         3         4         5         6         7         8 
##  7.604138  5.561711  4.054364  3.133306  2.740812  2.628107  2.611909  2.604196 
##         9        10        11        12        13        14        15        16 
##  2.765845  2.623279  2.810065  2.681493  2.623566  2.835921  2.743862  2.671516 
##        17        18        19        20        21        22        23        24 
##  2.703870  2.769818  2.772467  2.704529  2.608160  2.901387  2.778697  2.856916 
##        25        26        27        28        29        30        31        32 
##  2.918231  2.879977  2.880003  2.931555  2.974818  2.914126  2.924498  2.972884 
##        33        34        35        36        37        38        39        40 
##  2.558216  2.641553  2.812773  2.900072  2.956161  2.697350  2.805771  2.651901 
##        41        42        43        44        45        46        47        48 
##  2.828436  2.711978  2.732000  2.743024  2.843084  2.995795  3.138973  3.296557 
##        49        50        51        52        53        54        55        56 
##  3.448649  3.592454  3.738714  3.909302  4.134691  4.449962  4.889210  5.480140 
##        57        58        59        60        61        62        63 
##  6.241339  7.182992  8.309647  9.623109 11.124484 12.815287 14.697907 
## 
## $residual.scale
## [1] 39.91836
## 
## $df
## [1] 2980.963
#NOTE(review): predict.loess() appears to reject a plain list as newdata (error "cannot be coerced to double"), presumably because it needs its input coerced to numeric; unlike the earlier models, newdata must be a data.frame here -- confirm. In previous cases, newdata=list( ) could equally be replaced by data.frame( )
lines(age.grid,predict(fit1,data.frame(age=age.grid)),col="red",lwd=2)
lines(age.grid,predict(fit2,data.frame(age=age.grid)),col="blue",lwd=2)
legend("topright",legend=c("Span=0.2","Span=0.5"),col=c("red","blue"),lty=1,lwd=2,cex=0.8)

Generalized Additive Model (GAM)

The natural way to extend basic multiple regression is to turn the model \(y_i=\beta_0+\beta_1x_{i1}+\dots+\beta_px_{ip}+\epsilon_i\) into \(y_i=\beta_0+\beta_1f_1(x_{i1})+\dots+\beta_pf_p(x_{ip})+\epsilon_i\), which is called a generalized additive model.

However, an obvious limitation of a GAM is the assumption that the effects of the predictors are additive, so interactions between them are missed. The underlying truth might not be additive. However, this is easy to address by simply adding new predictors (for example, an interaction term).

#first see the performance of linear regression with some basis functions
# A GAM fitted via lm(): natural splines in year (4 df) and age (5 df) plus
# the education factor -- one big linear regression on basis functions.
gam1<-lm(wage~ns(year,4)+ns(age,5)+education)
# We perform GAM using gam( ) from the gam library
#s() requests a smoothing spline for that individual predictor
library(gam)
## Loading required package: foreach
## Loaded gam 1.22
gam.m3<-gam(wage~s(year,4)+s(age,5)+education)
par(mfrow=c(1,3))
plot(gam.m3,se=TRUE,col="blue")

#To plot a GAM fitted via lm() we call plot.Gam explicitly; for gam objects, plot() recognizes the class and dispatches to plot.Gam automatically
plot.Gam(gam1,se=TRUE,col="red")

#Comparing different models using ANOVA
# Nested models: m1 has no year term, m2 adds year linearly, m3 replaces it
# with a smoothing spline; anova() tests each successive addition.
gam.m1<-gam(wage~s(age,5)+education)
gam.m2<-gam(wage~year+s(age,5)+education)
anova(gam.m1,gam.m2,gam.m3)
## Analysis of Deviance Table
## 
## Model 1: wage ~ s(age, 5) + education
## Model 2: wage ~ year + s(age, 5) + education
## Model 3: wage ~ s(year, 4) + s(age, 5) + education
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      2990    3711731                          
## 2      2989    3693842  1  17889.2 0.0001419 ***
## 3      2986    3689770  3   4071.1 0.3483897    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The anova results show the model with a linear year term is significantly better, while the smoothing spline in year adds nothing beyond linear
summary(gam.m3)
## 
## Call: gam(formula = wage ~ s(year, 4) + s(age, 5) + education)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -119.43  -19.70   -3.33   14.17  213.48 
## 
## (Dispersion Parameter for gaussian family taken to be 1235.69)
## 
##     Null Deviance: 5222086 on 2999 degrees of freedom
## Residual Deviance: 3689770 on 2986 degrees of freedom
## AIC: 29887.75 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##              Df  Sum Sq Mean Sq F value    Pr(>F)    
## s(year, 4)    1   27162   27162  21.981 2.877e-06 ***
## s(age, 5)     1  195338  195338 158.081 < 2.2e-16 ***
## education     4 1069726  267432 216.423 < 2.2e-16 ***
## Residuals  2986 3689770    1236                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##             Npar Df Npar F  Pr(F)    
## (Intercept)                          
## s(year, 4)        3  1.086 0.3537    
## s(age, 5)         4 32.380 <2e-16 ***
## education                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Note that we can directly use predict() to get predicted values; this time the model uses all the variables, not just age
preds<-predict(gam.m2,newdata=Wage)
#Previously we used smoothing splines for each single predictor; we can also use local regression
# lo() requests a loess term (span = 0.7) for age inside the GAM.
gam.lo<-gam(wage~s(year,df=4)+lo(age,span=0.7)+education)
plot.Gam(gam.lo,se=TRUE,col="blue")

# lo(year, age) fits a two-dimensional local-regression surface in year and
# age jointly, so their interaction is captured as well.
gam.lo.i<-gam(wage~lo(year,age,span=0.5)+education)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : liv
## too small. (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : lv
## too small. (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : liv
## too small. (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : lv
## too small. (Discovered by lowesd)
library(akima)

#akima can draw multidimensional graphs. In gam.lo.i we fitted a local-regression surface on year and age, which records their interaction as well
plot(gam.lo.i)

#logistic GAM
# family=binomial gives a logistic GAM for P(wage > 250): linear in year,
# smoothing spline (5 df) in age, dummy variables for education.
gam.lr<-gam(I(wage>250)~year+s(age,df=5)+education,family=binomial)
par(mfrow=c(1,3))
plot(gam.lr,se=T,col="green")

# Cross-tabulation: the "< HS Grad" level contains no high earners at all.
table(education,I(wage>250))
##                     
## education            FALSE TRUE
##   1. < HS Grad         268    0
##   2. HS Grad           966    5
##   3. Some College      643    7
##   4. College Grad      663   22
##   5. Advanced Degree   381   45
# Refit excluding the "< HS Grad" category, which the table above shows has
# zero high earners (268 FALSE, 0 TRUE), making its fitted curve degenerate.
# BUG FIX: the factor level is "1. < HS Grad" (with spaces); the original
# string "1.<HS Grad" matched no level, so nothing was actually excluded.
gam.lr.s<-gam(I(wage>250)~year+s(age,df=5)+education,family=binomial,subset=(education!="1. < HS Grad"))
plot(gam.lr.s,se=T,col="green")